{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Regression Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import LogisticRegression\n",
    "from pyspark.ml.evaluation import RegressionEvaluator\n",
    "from pyspark.ml import Pipeline\n",
    "from pyspark.mllib.regression import LabeledPoint\n",
    "from pyspark.ml.linalg import Vectors\n",
    "from pyspark.ml.feature import StringIndexer\n",
    "from pyspark.mllib.evaluation import MulticlassMetrics\n",
    "from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Utility function to create the appropriate data frame for classification algorithms in MLlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mapLibSVM(row): \n",
    "    return (row[5],Vectors.dense(row[:3]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "create the dataframe from a csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read \\\n",
    "        .format(\"csv\") \\\n",
    "        .option(\"header\", \"true\") \\\n",
    "        .option(\"inferSchema\", \"true\") \\\n",
    "        .load(\"datasets/iris.data\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Classification algorithms requires numeric values for labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "indexer = StringIndexer(inputCol=\"label\", outputCol=\"labelIndex\")\n",
    "indexer = indexer.fit(df).transform(df)\n",
    "indexer.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfLabeled = indexer.rdd.map(mapLibSVM).toDF([\"label\", \"features\"])\n",
    "dfLabeled.show()\n",
    "train, test = dfLabeled.randomSplit([0.9, 0.1], seed=12345)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "schema verification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Instantiate the Logistic Regression and the pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr = LogisticRegression(labelCol=\"label\", maxIter=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We use a *ParamGridBuilder* to construct a grid of parameters to search over.\n",
    "\n",
    "*TrainValidationSplit* will try all combinations of values and determine best model using the evaluator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "paramGrid = ParamGridBuilder()\\\n",
    "    .addGrid(lr.regParam, [0.1, 0.001]) \\\n",
    "    .build()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this case the estimator is simply the linear regression.\n",
    "\n",
    "A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tvs = TrainValidationSplit(estimator=lr,\n",
    "                           estimatorParamMaps=paramGrid,\n",
    "                           evaluator=RegressionEvaluator(),\n",
    "                           # 80% of the data will be used for training, 20% for validation.\n",
    "                           trainRatio=0.9)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit the pipeline to training documents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = tvs.fit(train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compute the predictions from the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "result = model.transform(test)\n",
    "predictions = result.select([\"prediction\", \"label\"])\n",
    "predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate metrics object\n",
    "metrics = MulticlassMetrics(predictions.rdd)\n",
    "\n",
    "# Overall statistics\n",
    "print(\"Summary Stats\")\n",
    "print(\"Precision = %s\" % metrics.precision())\n",
    "print(\"Recall = %s\" % metrics.recall())\n",
    "print(\"F1 Score = %s\" % metrics.fMeasure())\n",
    "print(\"Accuracy = %s\" % metrics.accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Weighted stats\n",
    "print(\"Weighted recall = %s\" % metrics.weightedRecall)\n",
    "print(\"Weighted precision = %s\" % metrics.weightedPrecision)\n",
    "print(\"Weighted F(1) Score = %s\" % metrics.weightedFMeasure())\n",
    "print(\"Weighted F(0.5) Score = %s\" % metrics.weightedFMeasure(beta=0.5))\n",
    "print(\"Weighted false positive rate = %s\" % metrics.weightedFalsePositiveRate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Apache Toree - PySpark",
   "language": "python",
   "name": "apache_toree_pyspark"
  },
  "language_info": {
   "codemirror_mode": "text/x-ipython",
   "file_extension": ".py",
   "mimetype": "text/x-ipython",
   "name": "python",
   "pygments_lexer": "python",
   "version": "3.6.3\n"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
